1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the utils of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
45 #include <qbytearray.h>
51 #include <private/qunicodetables_p.h>
// Unicode data version, once as a plain version string and once as the
// textual name of the matching QChar::UnicodeVersion enumerator.
54 #define DATA_VERSION_S "6.1"
55 #define DATA_VERSION_STR "QChar::Unicode_6_1"
// Maps an age string (e.g. "3.2") to the corresponding QChar::UnicodeVersion.
58 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
// Populates age_map from a static table of (version, age string) pairs.
// Note that the string "2.1" is mapped to QChar::Unicode_2_1_2.
60 static void initAgeMap()
63 const QChar::UnicodeVersion version;
66 { QChar::Unicode_1_1, "1.1" },
67 { QChar::Unicode_2_0, "2.0" },
68 { QChar::Unicode_2_1_2, "2.1" },
69 { QChar::Unicode_3_0, "3.0" },
70 { QChar::Unicode_3_1, "3.1" },
71 { QChar::Unicode_3_2, "3.2" },
72 { QChar::Unicode_4_0, "4.0" },
73 { QChar::Unicode_4_1, "4.1" },
74 { QChar::Unicode_5_0, "5.0" },
75 { QChar::Unicode_5_1, "5.1" },
76 { QChar::Unicode_5_2, "5.2" },
77 { QChar::Unicode_6_0, "6.0" },
78 { QChar::Unicode_6_1, "6.1" },
// entry with a null age string; presumably the end-of-table sentinel
79 { QChar::Unicode_Unassigned, 0 }
// key is the age string, value is the version enumerator
83 age_map.insert(d->age, d->version);
// Maps a two-letter category abbreviation (e.g. "Lu") to QChar::Category.
89 static QHash<QByteArray, QChar::Category> categoryMap;
// Populates categoryMap from a static table of (category, abbreviation) pairs,
// grouped below by the first letter of the abbreviation (M, N, Z, C, L, P, S).
91 static void initCategoryMap()
97 { QChar::Mark_NonSpacing, "Mn" },
98 { QChar::Mark_SpacingCombining, "Mc" },
99 { QChar::Mark_Enclosing, "Me" },
101 { QChar::Number_DecimalDigit, "Nd" },
102 { QChar::Number_Letter, "Nl" },
103 { QChar::Number_Other, "No" },
105 { QChar::Separator_Space, "Zs" },
106 { QChar::Separator_Line, "Zl" },
107 { QChar::Separator_Paragraph, "Zp" },
109 { QChar::Other_Control, "Cc" },
110 { QChar::Other_Format, "Cf" },
111 { QChar::Other_Surrogate, "Cs" },
112 { QChar::Other_PrivateUse, "Co" },
113 { QChar::Other_NotAssigned, "Cn" },
115 { QChar::Letter_Uppercase, "Lu" },
116 { QChar::Letter_Lowercase, "Ll" },
117 { QChar::Letter_Titlecase, "Lt" },
118 { QChar::Letter_Modifier, "Lm" },
119 { QChar::Letter_Other, "Lo" },
121 { QChar::Punctuation_Connector, "Pc" },
122 { QChar::Punctuation_Dash, "Pd" },
123 { QChar::Punctuation_Open, "Ps" },
124 { QChar::Punctuation_Close, "Pe" },
125 { QChar::Punctuation_InitialQuote, "Pi" },
126 { QChar::Punctuation_FinalQuote, "Pf" },
127 { QChar::Punctuation_Other, "Po" },
129 { QChar::Symbol_Math, "Sm" },
130 { QChar::Symbol_Currency, "Sc" },
131 { QChar::Symbol_Modifier, "Sk" },
132 { QChar::Symbol_Other, "So" },
// entry with a null name; presumably the end-of-table sentinel
133 { QChar::Other_NotAssigned, 0 }
// key is the abbreviation, value is the category enumerator
137 categoryMap.insert(c->name, c->cat);
// Maps a decomposition tag (e.g. "<font>") to QChar::Decomposition.
143 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
// Populates decompositionMap from a static table of (decomposition, tag) pairs.
145 static void initDecompositionMap()
148 QChar::Decomposition dec;
150 } decompositions[] = {
151 { QChar::Canonical, "<canonical>" },
152 { QChar::Font, "<font>" },
153 { QChar::NoBreak, "<noBreak>" },
154 { QChar::Initial, "<initial>" },
155 { QChar::Medial, "<medial>" },
156 { QChar::Final, "<final>" },
157 { QChar::Isolated, "<isolated>" },
158 { QChar::Circle, "<circle>" },
159 { QChar::Super, "<super>" },
160 { QChar::Sub, "<sub>" },
161 { QChar::Vertical, "<vertical>" },
162 { QChar::Wide, "<wide>" },
163 { QChar::Narrow, "<narrow>" },
164 { QChar::Small, "<small>" },
165 { QChar::Square, "<square>" },
166 { QChar::Compat, "<compat>" },
167 { QChar::Fraction, "<fraction>" },
// entry with a null name; presumably the end-of-table sentinel
168 { QChar::NoDecomposition, 0 }
// cursor over the table, starting at its first entry
170 Dec *d = decompositions;
// key is the tag, value is the decomposition enumerator
172 decompositionMap.insert(d->name, d->dec);
// Maps a direction abbreviation (e.g. "L", "AN") to QChar::Direction.
178 static QHash<QByteArray, QChar::Direction> directionMap;
// Populates directionMap from a static table of (direction, abbreviation) pairs.
180 static void initDirectionMap()
183 QChar::Direction dir;
186 { QChar::DirL, "L" },
187 { QChar::DirR, "R" },
188 { QChar::DirEN, "EN" },
189 { QChar::DirES, "ES" },
190 { QChar::DirET, "ET" },
191 { QChar::DirAN, "AN" },
192 { QChar::DirCS, "CS" },
193 { QChar::DirB, "B" },
194 { QChar::DirS, "S" },
195 { QChar::DirWS, "WS" },
196 { QChar::DirON, "ON" },
197 { QChar::DirLRE, "LRE" },
198 { QChar::DirLRO, "LRO" },
199 { QChar::DirAL, "AL" },
200 { QChar::DirRLE, "RLE" },
201 { QChar::DirRLO, "RLO" },
202 { QChar::DirPDF, "PDF" },
203 { QChar::DirNSM, "NSM" },
204 { QChar::DirBN, "BN" },
// key is the abbreviation, value is the direction enumerator
209 directionMap.insert(d->name, d->dir);
// Maps a single-letter joining code (e.g. "D") to the tool's Joining value.
226 static QHash<QByteArray, Joining> joining_map;
// Populates joining_map from a static table of (joining, code) pairs.
228 static void initJoiningMap()
234 { Joining_None, "U" },
235 { Joining_Left, "L" },
236 { Joining_Causing, "C" },
237 { Joining_Dual, "D" },
238 { Joining_Right, "R" },
239 { Joining_Transparent, "T" },
// entry with a null name; presumably the end-of-table sentinel
240 { Joining_Unassigned, 0 }
// cursor over the table, starting at its first entry
242 JoiningList *d = joinings;
// key is the code, value is the joining enumerator
244 joining_map.insert(d->name, d->joining);
// C++ source text of the GraphemeBreak enum, held as a string constant
// (presumably emitted into the generated header -- TODO confirm at the use site).
250 static const char *grapheme_break_string =
251 "enum GraphemeBreak {\n"
252 " GraphemeBreakOther,\n"
253 " GraphemeBreakCR,\n"
254 " GraphemeBreakLF,\n"
255 " GraphemeBreakControl,\n"
256 " GraphemeBreakExtend,\n"
257 " GraphemeBreakPrepend,\n"
258 " GraphemeBreakSpacingMark,\n"
262 " GraphemeBreakLV,\n"
263 " GraphemeBreakLVT\n"
// Enumerators of the tool's own enum, using the same names as the string above.
270 GraphemeBreakControl,
272 GraphemeBreakPrepend,
273 GraphemeBreakSpacingMark,
// extra enumerator that does not appear in the string above; used as the
// "not found" default and sentinel value in initGraphemeBreak
280 , GraphemeBreak_Unassigned
// Maps a grapheme break property name (e.g. "SpacingMark") to GraphemeBreak.
283 static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
// Populates grapheme_break_map from a static table of (break class, name) pairs.
285 static void initGraphemeBreak()
287 struct GraphemeBreakList {
291 { GraphemeBreakOther, "Other" },
292 { GraphemeBreakCR, "CR" },
293 { GraphemeBreakLF, "LF" },
294 { GraphemeBreakControl, "Control" },
295 { GraphemeBreakExtend, "Extend" },
296 { GraphemeBreakPrepend, "Prepend" },
297 { GraphemeBreakSpacingMark, "SpacingMark" },
298 { GraphemeBreakL, "L" },
299 { GraphemeBreakV, "V" },
300 { GraphemeBreakT, "T" },
301 { GraphemeBreakLV, "LV" },
302 { GraphemeBreakLVT, "LVT" },
// entry with a null name; presumably the end-of-table sentinel
303 { GraphemeBreak_Unassigned, 0 }
// cursor over the table, starting at its first entry
305 GraphemeBreakList *d = breaks;
// key is the property name, value is the break class
307 grapheme_break_map.insert(d->name, d->brk);
// C++ source text of the word break enum, held as a string constant
// (presumably emitted into the generated header -- TODO confirm at the use site).
313 static const char *word_break_string =
318 " WordBreakNewline,\n"
319 " WordBreakFormat,\n"
320 " WordBreakKatakana,\n"
321 " WordBreakALetter,\n"
322 " WordBreakMidNumLet,\n"
323 " WordBreakMidLetter,\n"
324 " WordBreakMidNum,\n"
325 " WordBreakNumeric,\n"
326 " WordBreakExtendNumLet\n"
// Tail of the tool's own enum, using the same names as the string above.
341 WordBreakExtendNumLet
// extra enumerator that does not appear in the string above; used as the
// sentinel value in initWordBreak
343 , WordBreak_Unassigned
// Maps a word break property name (e.g. "ALetter") to WordBreak.
346 static QHash<QByteArray, WordBreak> word_break_map;
// Populates word_break_map from a static table of (break class, name) pairs.
// Both "Extend" and "Format" are mapped to WordBreakFormat.
348 static void initWordBreak()
350 struct WordBreakList {
354 { WordBreakOther, "Other" },
355 { WordBreakCR, "CR" },
356 { WordBreakLF, "LF" },
357 { WordBreakNewline, "Newline" },
358 { WordBreakFormat, "Extend" },
359 { WordBreakFormat, "Format" },
360 { WordBreakKatakana, "Katakana" },
361 { WordBreakALetter, "ALetter" },
362 { WordBreakMidNumLet, "MidNumLet" },
363 { WordBreakMidLetter, "MidLetter" },
364 { WordBreakMidNum, "MidNum" },
365 { WordBreakNumeric, "Numeric" },
366 { WordBreakExtendNumLet, "ExtendNumLet" },
// entry with a null name; presumably the end-of-table sentinel
367 { WordBreak_Unassigned, 0 }
// cursor over the table, starting at its first entry
369 WordBreakList *d = breaks;
// key is the property name, value is the break class
371 word_break_map.insert(d->name, d->brk);
// C++ source text of the SentenceBreak enum, held as a string constant
// (presumably emitted into the generated header -- TODO confirm at the use site).
377 static const char *sentence_break_string =
378 "enum SentenceBreak {\n"
379 " SentenceBreakOther,\n"
380 " SentenceBreakCR,\n"
381 " SentenceBreakLF,\n"
382 " SentenceBreakSep,\n"
383 " SentenceBreakFormat,\n"
384 " SentenceBreakSp,\n"
385 " SentenceBreakLower,\n"
386 " SentenceBreakUpper,\n"
387 " SentenceBreakOLetter,\n"
388 " SentenceBreakNumeric,\n"
389 " SentenceBreakATerm,\n"
390 " SentenceBreakSContinue,\n"
391 " SentenceBreakSTerm,\n"
392 " SentenceBreakClose\n"
// Enumerators of the tool's own enum, using the same names as the string above.
404 SentenceBreakOLetter,
405 SentenceBreakNumeric,
407 SentenceBreakSContinue,
// extra enumerator that does not appear in the string above; used as the
// sentinel value in initSentenceBreak
411 , SentenceBreak_Unassigned
// Maps a sentence break property name (e.g. "ATerm") to SentenceBreak.
414 static QHash<QByteArray, SentenceBreak> sentence_break_map;
// Populates sentence_break_map from a static table of (break class, name) pairs.
// Both "Extend" and "Format" are mapped to SentenceBreakFormat.
416 static void initSentenceBreak()
418 struct SentenceBreakList {
422 { SentenceBreakOther, "Other" },
423 { SentenceBreakCR, "CR" },
424 { SentenceBreakLF, "LF" },
425 { SentenceBreakSep, "Sep" },
426 { SentenceBreakFormat, "Extend" },
427 { SentenceBreakFormat, "Format" },
428 { SentenceBreakSp, "Sp" },
429 { SentenceBreakLower, "Lower" },
430 { SentenceBreakUpper, "Upper" },
431 { SentenceBreakOLetter, "OLetter" },
432 { SentenceBreakNumeric, "Numeric" },
433 { SentenceBreakATerm, "ATerm" },
434 { SentenceBreakSContinue, "SContinue" },
435 { SentenceBreakSTerm, "STerm" },
436 { SentenceBreakClose, "Close" },
// entry with a null name; presumably the end-of-table sentinel
437 { SentenceBreak_Unassigned, 0 }
// cursor over the table, starting at its first entry
439 SentenceBreakList *d = breaks;
// key is the property name, value is the break class
441 sentence_break_map.insert(d->name, d->brk);
// C++ source text of the LineBreakClass enum (with its two leading comment
// lines), held as a string constant (presumably emitted into the generated
// header -- TODO confirm at the use site).
447 static const char *line_break_class_string =
448 "// see http://www.unicode.org/reports/tr14/tr14-28.html\n"
449 "// we don't use the XX and AI classes and map them to AL instead.\n"
450 "enum LineBreakClass {\n"
451 " LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,\n"
452 " LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,\n"
453 " LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n"
454 " LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n"
455 " LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n"
456 " LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_CB, LineBreak_SA,\n"
457 " LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
// The tool's own copy of the enum: same enumerators, in the same order, as the
// string above.
460 enum LineBreakClass {
461 LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,
462 LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,
463 LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,
464 LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
465 LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,
466 LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_CB, LineBreak_SA,
467 LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK
// extra enumerator that does not appear in the string above; used as the
// "not found" default and sentinel value by the line break code below
469 , LineBreak_Unassigned
// Maps a two-letter line break class name (e.g. "BK") to LineBreakClass.
472 static QHash<QByteArray, LineBreakClass> line_break_map;
// Populates line_break_map from a static table of (class, name) pairs.
// Several names are folded onto another class, as the notes below describe:
// "NL" -> BK, "CJ" -> NS, and "AI" / "XX" -> AL.
474 static void initLineBreak()
476 // ### Classes XX and AI are left out and mapped to AL for now.
477 // ### Class NL is mapped to BK.
478 // ### Treating characters of class CJ as class NS will give CSS strict line breaking;
479 // treating them as class ID will give CSS normal breaking.
480 struct LineBreakList {
484 { LineBreak_BK, "BK" },
485 { LineBreak_CR, "CR" },
486 { LineBreak_LF, "LF" },
487 { LineBreak_CM, "CM" },
488 { LineBreak_BK, "NL" },
489 { LineBreak_SG, "SG" },
490 { LineBreak_WJ, "WJ" },
491 { LineBreak_ZW, "ZW" },
492 { LineBreak_GL, "GL" },
493 { LineBreak_SP, "SP" },
494 { LineBreak_B2, "B2" },
495 { LineBreak_BA, "BA" },
496 { LineBreak_BB, "BB" },
497 { LineBreak_HY, "HY" },
498 { LineBreak_CB, "CB" },
499 { LineBreak_NS, "CJ" },
500 { LineBreak_CL, "CL" },
501 { LineBreak_CP, "CP" },
502 { LineBreak_EX, "EX" },
503 { LineBreak_IN, "IN" },
504 { LineBreak_NS, "NS" },
505 { LineBreak_OP, "OP" },
506 { LineBreak_QU, "QU" },
507 { LineBreak_IS, "IS" },
508 { LineBreak_NU, "NU" },
509 { LineBreak_PO, "PO" },
510 { LineBreak_PR, "PR" },
511 { LineBreak_SY, "SY" },
512 { LineBreak_AL, "AI" },
513 { LineBreak_AL, "AL" },
514 { LineBreak_HL, "HL" },
515 { LineBreak_H2, "H2" },
516 { LineBreak_H3, "H3" },
517 { LineBreak_ID, "ID" },
518 { LineBreak_JL, "JL" },
519 { LineBreak_JV, "JV" },
520 { LineBreak_JT, "JT" },
521 { LineBreak_SA, "SA" },
522 { LineBreak_AL, "XX" },
// entry with a null name; presumably the end-of-table sentinel
523 { LineBreak_Unassigned, 0 }
// cursor over the table, starting at its first entry
525 LineBreakList *d = breaks;
// key is the class name, value is the line break class
527 line_break_map.insert(d->name, d->brk);
// C++ source text describing the Properties struct and the two properties()
// lookup declarations, held as a string constant. The bit-field widths listed
// here add up to 160 bits, i.e. 20 bytes.
533 // Keep this one in sync with the code in createPropertyInfo
534 static const char *property_string =
535 "struct Properties {\n"
536 " ushort category : 8; /* 5 used */\n"
537 " ushort direction : 8; /* 5 used */\n"
538 " ushort combiningClass : 8;\n"
539 " ushort joining : 2;\n"
540 " signed short digitValue : 6; /* 5 used */\n"
541 " signed short mirrorDiff : 16;\n"
542 " signed short lowerCaseDiff : 16;\n"
543 " signed short upperCaseDiff : 16;\n"
544 " signed short titleCaseDiff : 16;\n"
545 " signed short caseFoldDiff : 16;\n"
546 " ushort lowerCaseSpecial : 1;\n"
547 " ushort upperCaseSpecial : 1;\n"
548 " ushort titleCaseSpecial : 1;\n"
549 " ushort caseFoldSpecial : 1;\n"
550 " ushort unicodeVersion : 4;\n"
551 " ushort graphemeBreak : 8; /* 4 used */\n"
552 " ushort wordBreak : 8; /* 4 used */\n"
553 " ushort sentenceBreak : 8; /* 4 used */\n"
554 " ushort line_break_class : 8; /* 6 used */\n"
555 " ushort script : 8; /* 5 used */\n"
557 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
558 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n"
// C++ source text declaring the per-property lookup functions, held as a string
// constant. For each of grapheme/word/sentence/line break class and script it
// contains an exported function taking a uint code point plus an inline QChar
// overload that forwards ch.unicode() to it.
561 static const char *methods =
562 "Q_CORE_EXPORT GraphemeBreak QT_FASTCALL graphemeBreakClass(uint ucs4);\n"
563 "inline GraphemeBreak graphemeBreakClass(QChar ch)\n"
564 "{ return graphemeBreakClass(ch.unicode()); }\n"
566 "Q_CORE_EXPORT WordBreak QT_FASTCALL wordBreakClass(uint ucs4);\n"
567 "inline WordBreak wordBreakClass(QChar ch)\n"
568 "{ return wordBreakClass(ch.unicode()); }\n"
570 "Q_CORE_EXPORT SentenceBreak QT_FASTCALL sentenceBreakClass(uint ucs4);\n"
571 "inline SentenceBreak sentenceBreakClass(QChar ch)\n"
572 "{ return sentenceBreakClass(ch.unicode()); }\n"
574 "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
575 "inline LineBreakClass lineBreakClass(QChar ch)\n"
576 "{ return lineBreakClass(ch.unicode()); }\n"
578 "Q_CORE_EXPORT Script QT_FASTCALL script(uint ucs4);\n"
579 "inline Script script(QChar ch)\n"
580 "{ return script(ch.unicode()); }\n"
// Size in bytes of the Properties struct described by property_string: the
// bit-field widths declared there sum to 160 bits = 20 bytes. Must be updated
// together with property_string.
583 static const int SizeOfPropertiesStruct = 20;
// Per-code-point property record. The comments inside name the data file each
// group of fields comes from.
585 struct PropertyFlags {
// Field-by-field equality.
// NOTE(review): the operator is not const-qualified, so it cannot be invoked
// on a const left-hand operand.
586 bool operator ==(const PropertyFlags &o) {
587 return (combiningClass == o.combiningClass
588 && category == o.category
589 && direction == o.direction
590 && joining == o.joining
592 && digitValue == o.digitValue
593 && mirrorDiff == o.mirrorDiff
594 && lowerCaseDiff == o.lowerCaseDiff
595 && upperCaseDiff == o.upperCaseDiff
596 && titleCaseDiff == o.titleCaseDiff
597 && caseFoldDiff == o.caseFoldDiff
598 && lowerCaseSpecial == o.lowerCaseSpecial
599 && upperCaseSpecial == o.upperCaseSpecial
600 && titleCaseSpecial == o.titleCaseSpecial
601 && caseFoldSpecial == o.caseFoldSpecial
602 && graphemeBreak == o.graphemeBreak
603 && wordBreak == o.wordBreak
604 && sentenceBreak == o.sentenceBreak
605 && line_break_class == o.line_break_class
606 && script == o.script
609 // from UnicodeData.txt
610 uchar combiningClass : 8;
611 QChar::Category category : 5;
612 QChar::Direction direction : 5;
613 // from ArabicShaping.txt
614 QChar::Joining joining : 2;
615 // from DerivedAge.txt
616 QChar::UnicodeVersion age : 4;
// When a *CaseSpecial flag is set, readUnicodeData stores an index into
// specialCaseMap in the matching *CaseDiff field instead of a plain offset.
625 bool lowerCaseSpecial;
626 bool upperCaseSpecial;
627 bool titleCaseSpecial;
628 bool caseFoldSpecial;
629 GraphemeBreak graphemeBreak;
631 SentenceBreak sentenceBreak;
632 LineBreakClass line_break_class;
// Flat list of length-prefixed sequences: each entry is a count followed by
// that many values. Index 0 holds a placeholder 0 (inserted on first use).
637 static QList<int> specialCaseMap;

// Stores the code points in `map` into specialCaseMap as a length-prefixed
// UTF-16 sequence. Code points that need surrogates are stored as a
// high/low surrogate pair. Existing entries are walked and compared element
// by element, presumably so that an identical sequence is reused rather than
// appended again, and the position of the entry is presumably the return
// value -- TODO confirm both against the full function body.
639 static int appendToSpecialCaseMap(const QList<int> &map)
642 for (int i = 0; i < map.size(); ++i) {
644 if (QChar::requiresSurrogates(val)) {
645 utf16map << QChar::highSurrogate(val);
646 utf16map << QChar::lowSurrogate(val);
// prefix the sequence with its length in UTF-16 units
651 int length = utf16map.size();
652 utf16map.prepend(length);
654 if (specialCaseMap.isEmpty())
655 specialCaseMap << 0; // placeholder
// scan the existing entries; n is the length prefix of the entry at index i
658 while (i < specialCaseMap.size()) {
659 int n = specialCaseMap.at(i);
// compare the n payload values (utf16map.at(0) is the length prefix)
662 for (j = 1; j <= n; ++j) {
663 if (specialCaseMap.at(i+j) != utf16map.at(j))
// append the new entry; pos is the index of its length prefix
672 int pos = specialCaseMap.size();
673 specialCaseMap << utf16map;
// Constructs the default record for `codepoint`: category Cn, combining class 0,
// and a direction and line break class that depend on the range the code point
// falls in (see the range lists below).
678 UnicodeData(int codepoint = 0) {
679 p.category = QChar::Other_NotAssigned; // Cn
680 p.combiningClass = 0;
// default direction is L unless one of the ranges below matches
682 p.direction = QChar::DirL;
683 // DerivedBidiClass.txt
684 // The unassigned code points that default to AL are in the ranges:
685 // [U+0600..U+07BF, U+08A0..U+08FF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFF, U+1EE00..U+1EEFF]
686 if ((codepoint >= 0x0600 && codepoint <= 0x07BF)
687 || (codepoint >= 0x08A0 && codepoint <= 0x08FF)
688 || (codepoint >= 0xFB50 && codepoint <= 0xFDCF)
689 || (codepoint >= 0xFDF0 && codepoint <= 0xFDFF)
690 || (codepoint >= 0xFE70 && codepoint <= 0xFEFF)
691 || (codepoint >= 0x1EE00 && codepoint <= 0x1EEFF)) {
692 p.direction = QChar::DirAL;
694 // The unassigned code points that default to R are in the ranges:
695 // [U+0590..U+05FF, U+07C0..U+089F, U+FB1D..U+FB4F, U+10800..U+10FFF, U+1E800..U+1EDFF, U+1EF00..U+1EFFF]
696 else if ((codepoint >= 0x0590 && codepoint <= 0x05FF)
697 || (codepoint >= 0x07C0 && codepoint <= 0x089F)
698 || (codepoint >= 0xFB1D && codepoint <= 0xFB4F)
699 || (codepoint >= 0x10800 && codepoint <= 0x10FFF)
700 || (codepoint >= 0x1E800 && codepoint <= 0x1EDFF)
701 || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
702 p.direction = QChar::DirR;
// default line break class is AL unless one of the ranges below matches
705 p.line_break_class = LineBreak_AL; // XX -> AL
707 // The unassigned code points that default to "ID" include ranges in the following blocks:
708 // [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2F800..U+2FA1F, U+20000..U+2FFFD, U+30000..U+3FFFD]
// Note: the U+20000..U+2FFFD test already covers the four narrower
// plane-2 ranges listed before it.
709 if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
710 || (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
711 || (codepoint >= 0xF900 && codepoint <= 0xFAFF)
712 || (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
713 || (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
714 || (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
715 || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
716 || (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
717 || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
718 p.line_break_class = LineBreak_ID;
// remaining fields start out as "none" / "unassigned" / "other"
722 decompositionType = QChar::NoDecomposition;
723 p.joining = QChar::OtherJoining;
724 p.age = QChar::Unicode_Unassigned;
731 p.lowerCaseSpecial = 0;
732 p.upperCaseSpecial = 0;
733 p.titleCaseSpecial = 0;
734 p.caseFoldSpecial = 0;
735 p.graphemeBreak = GraphemeBreakOther;
736 p.wordBreak = WordBreakOther;
737 p.sentenceBreak = SentenceBreakOther;
738 p.script = 0; // Common
740 excludedComposition = false;
// Returns a mutable reference to the record for `codepoint` in the global table.
743 static UnicodeData &valueRef(int codepoint);
747 // from UnicodeData.txt
748 QChar::Decomposition decompositionType;
// decomposition mapping as a list of code points
749 QList<int> decomposition;
751 QList<int> specialFolding;
753 // from BidiMirroring.txt
756 // DerivedNormalizationProps.txt
757 bool excludedComposition;
759 // computed position of unicode property set
// Global table with one UnicodeData record per code point, indexed by code point.
763 static QList<UnicodeData> unicodeData;
// Returns a mutable reference to the record for `codepoint`. The table is
// filled with UnicodeData(uc) for every uc in 0..QChar::LastValidCodePoint;
// the `initialized` flag presumably makes that fill happen only once.
// Only the upper bound of `codepoint` is asserted.
765 UnicodeData &UnicodeData::valueRef(int codepoint)
767 static bool initialized = false;
769 unicodeData.reserve(QChar::LastValidCodePoint + 1);
770 for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc)
771 unicodeData.append(UnicodeData(uc));
775 Q_ASSERT(codepoint <= 0x10ffff);
776 return unicodeData[codepoint];
// Statistics collected while reading the data files.
// decompositionLength: number of characters per decomposition length.
780 static QHash<int, int> decompositionLength;
// highest code point that has a non-empty decomposition field
781 static int highestComposedCharacter = 0;
782 static int numLigatures = 0;
// highest first component (part1) of any recorded ligature
783 static int highestLigature = 0;
790 // we need them sorted by the first component for fast lookup
// Orders ligatures by their first component (u1) only.
791 bool operator < (const Ligature &l1, const Ligature &l2)
792 { return l1.u1 < l2.u1; }
// ligatures grouped by their second component (see readDerivedNormalizationProps)
794 static QHash<int, QList<Ligature> > ligatureHashes;
// number of characters seen per combining class
796 static QHash<int, int> combiningClassUsage;
// largest absolute case offsets stored as a plain diff (updated in readUnicodeData)
798 static int maxLowerCaseDiff = 0;
799 static int maxUpperCaseDiff = 0;
800 static int maxTitleCaseDiff = 0;
// Parses data/UnicodeData.txt and fills the global unicodeData table.
// Each record is a ';'-separated line. From it this function takes the
// category, combining class, bidi direction, upper/lower/title case mappings,
// digit value and decomposition. A "<..., First>" record is paired with the
// following "<..., Last>" record and the parsed data is copied to every code
// point in that range.
802 static void readUnicodeData()
804 qDebug("Reading UnicodeData.txt");
813 UD_DecimalDigitValue,
824 QFile f("data/UnicodeData.txt");
826 qFatal("Couldn't find UnicodeData.txt");
828 f.open(QFile::ReadOnly);
// read one line into a fixed 1024-byte buffer and drop its last character
// (presumably the trailing newline)
833 int len = f.readLine(line.data(), 1024);
834 line.truncate(len-1);
// strip a trailing '#' comment
836 int comment = line.indexOf('#');
838 line = line.left(comment);
842 QList<QByteArray> properties = line.split(';');
// the code point field is hexadecimal
844 int codepoint = properties[UD_Value].toInt(&ok, 16);
846 Q_ASSERT(codepoint <= QChar::LastValidCodePoint);
847 int lastCodepoint = codepoint;
849 QByteArray name = properties[UD_Name];
// range start marker: read the matching "Last" record to get the range end
850 if (name.startsWith('<') && name.contains("First")) {
852 nextLine.resize(1024);
853 f.readLine(nextLine.data(), 1024);
// note: this local list shadows the outer `properties` inside this scope
854 QList<QByteArray> properties = nextLine.split(';');
855 Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
856 lastCodepoint = properties[UD_Value].toInt(&ok, 16);
858 Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint);
861 UnicodeData &data = UnicodeData::valueRef(codepoint);
862 data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
863 data.p.combiningClass = properties[UD_CombiningClass].toInt();
// count how many records use each combining class
864 if (!combiningClassUsage.contains(data.p.combiningClass))
865 combiningClassUsage[data.p.combiningClass] = 1;
867 ++combiningClassUsage[data.p.combiningClass];
// keep the constructor's default direction when the bidi field is unknown
869 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
// Upper case mapping: stored as an offset from the code point, or, when the
// offset is 1<<14 or larger in magnitude, as an index into specialCaseMap.
871 if (!properties[UD_UpperCase].isEmpty()) {
872 int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
874 int diff = upperCase - codepoint;
875 if (qAbs(diff) >= (1<<14)) {
876 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
877 // if the condition below doesn't hold anymore we need to modify our special upper casing code in qchar.cpp
878 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(upperCase));
879 data.p.upperCaseSpecial = true;
880 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
882 data.p.upperCaseDiff = diff;
883 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
885 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(upperCase)) {
886 // if the conditions below don't hold anymore we need to modify our upper casing code
887 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
888 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
// Lower case mapping: same scheme as the upper case mapping above.
891 if (!properties[UD_LowerCase].isEmpty()) {
892 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
894 int diff = lowerCase - codepoint;
895 if (qAbs(diff) >= (1<<14)) {
896 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
897 // if the condition below doesn't hold anymore we need to modify our special lower casing code in qchar.cpp
898 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(lowerCase));
899 data.p.lowerCaseSpecial = true;
900 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
902 data.p.lowerCaseDiff = diff;
903 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
905 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(lowerCase)) {
906 // if the conditions below don't hold anymore we need to modify our lower casing code
907 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
908 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
911 // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
912 if (properties[UD_TitleCase].isEmpty())
913 properties[UD_TitleCase] = properties[UD_UpperCase];
// Title case mapping: same scheme as the upper case mapping above.
914 if (!properties[UD_TitleCase].isEmpty()) {
915 int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
917 int diff = titleCase - codepoint;
918 if (qAbs(diff) >= (1<<14)) {
919 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
920 // if the condition below doesn't hold anymore we need to modify our special title casing code in qchar.cpp
921 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(titleCase));
922 data.p.titleCaseSpecial = true;
923 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
925 data.p.titleCaseDiff = diff;
926 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
928 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(titleCase)) {
929 // if the conditions below don't hold anymore we need to modify our title casing code
930 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
931 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
// the digit value is parsed as a decimal number
935 if (!properties[UD_DigitValue].isEmpty())
936 data.p.digitValue = properties[UD_DigitValue].toInt();
// Decomposition: an optional "<tag>" followed by space-separated hex code points.
939 QByteArray decomposition = properties[UD_Decomposition];
940 if (!decomposition.isEmpty()) {
941 highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
942 QList<QByteArray> d = decomposition.split(' ');
943 if (d[0].contains('<')) {
944 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
945 if (data.decompositionType == QChar::NoDecomposition)
946 qFatal("unassigned decomposition type: %s", d[0].constData());
// no tag present: the decomposition is canonical
949 data.decompositionType = QChar::Canonical;
951 for (int i = 0; i < d.size(); ++i) {
952 data.decomposition.append(d[i].toInt(&ok, 16));
// histogram of decomposition lengths
955 ++decompositionLength[data.decomposition.size()];
// copy the parsed record to every code point of the range (a single code
// point unless a First/Last pair was read)
958 for (int i = codepoint; i <= lastCodepoint; ++i)
959 unicodeData[i] = data;
// largest absolute distance between a code point and its mirrored counterpart
963 static int maxMirroredDiff = 0;

// Parses data/BidiMirroring.txt. After comment and space stripping each record
// is "codepoint;mirror" (both hexadecimal). The mirror and its offset from the
// code point are stored in the record for that code point.
965 static void readBidiMirroring()
967 qDebug("Reading BidiMirroring.txt");
969 QFile f("data/BidiMirroring.txt");
971 qFatal("Couldn't find BidiMirroring.txt");
973 f.open(QFile::ReadOnly);
// read one line into a fixed 1024-byte buffer
978 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment, then all spaces
981 int comment = line.indexOf('#');
983 line = line.left(comment);
987 line = line.replace(" ", "");
989 QList<QByteArray> pair = line.split(';');
990 Q_ASSERT(pair.size() == 2);
993 int codepoint = pair[0].toInt(&ok, 16);
995 int mirror = pair[1].toInt(&ok, 16);
998 UnicodeData &d = UnicodeData::valueRef(codepoint);
999 d.mirroredChar = mirror;
1000 d.p.mirrorDiff = d.mirroredChar - codepoint;
1001 maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
// Parses data/ArabicShaping.txt. Each record has four ';'-separated fields;
// field 0 is the hexadecimal code point and field 2 the joining code, which is
// translated through joining_map and stored as a QChar joining value.
// Aborts on an unknown joining code and on joining type Left.
1005 static void readArabicShaping()
1007 qDebug("Reading ArabicShaping.txt");
1009 QFile f("data/ArabicShaping.txt");
1011 qFatal("Couldn't find ArabicShaping.txt");
1013 f.open(QFile::ReadOnly);
1015 while (!f.atEnd()) {
// read one line into a fixed 1024-byte buffer
1018 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment and surrounding whitespace
1021 int comment = line.indexOf('#');
1023 line = line.left(comment);
1024 line = line.trimmed();
1029 QList<QByteArray> l = line.split(';');
1030 Q_ASSERT(l.size() == 4);
1033 int codepoint = l[0].toInt(&ok, 16);
1036 Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
1037 if (joining == Joining_Unassigned)
1038 qFatal("unassigned or unhandled joining value: %s", l[2].constData());
1040 if (joining == Joining_Left) {
1041 // There are currently no characters of joining type Left_Joining defined in Unicode.
1042 qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
// Right -> QChar::Right, Dual -> QChar::Dual, Causing -> QChar::Center,
// everything else -> QChar::OtherJoining
1045 UnicodeData &d = UnicodeData::valueRef(codepoint);
1046 if (joining == Joining_Right)
1047 d.p.joining = QChar::Right;
1048 else if (joining == Joining_Dual)
1049 d.p.joining = QChar::Dual;
1050 else if (joining == Joining_Causing)
1051 d.p.joining = QChar::Center;
1053 d.p.joining = QChar::OtherJoining;
// Parses data/DerivedAge.txt. After comment and space stripping each record is
// "from[..to];age", with hexadecimal code points. The age string is translated
// through age_map (unknown values abort) and every code point in the range is
// visited.
1057 static void readDerivedAge()
1059 qDebug("Reading DerivedAge.txt");
1061 QFile f("data/DerivedAge.txt");
1063 qFatal("Couldn't find DerivedAge.txt");
1065 f.open(QFile::ReadOnly);
1067 while (!f.atEnd()) {
// read one line into a fixed 1024-byte buffer
1070 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment, then all spaces
1073 int comment = line.indexOf('#');
1075 line = line.left(comment);
1076 line.replace(" ", "");
1081 QList<QByteArray> l = line.split(';');
1082 Q_ASSERT(l.size() == 2);
// collapse the ".." range separator to a single '.' so the range can be split
1084 QByteArray codes = l[0];
1085 codes.replace("..", ".");
1086 QList<QByteArray> cl = codes.split('.');
1089 int from = cl[0].toInt(&ok, 16);
// a second component is the inclusive end of the range
1092 if (cl.size() == 2) {
1093 to = cl[1].toInt(&ok, 16);
1097 QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1098 //qDebug() << hex << from << ".." << to << ba << age;
1099 if (age == QChar::Unicode_Unassigned)
1100 qFatal("unassigned or unhandled age value: %s", l[1].constData());
1102 for (int codepoint = from; codepoint <= to; ++codepoint) {
1103 UnicodeData &d = UnicodeData::valueRef(codepoint);
// Parses data/DerivedNormalizationProps.txt and then derives the ligature
// (composition) table.
//  1. Records whose property name is "Full_Composition_Exclusion" mark every
//     code point in their range as excluded from composition.
//  2. Every code point that is not excluded and has a canonical decomposition
//     of more than one element (asserted to be exactly two) is recorded as a
//     Ligature { part1, part2, codepoint } in ligatureHashes, keyed by part2.
1109 static void readDerivedNormalizationProps()
1111 qDebug("Reading DerivedNormalizationProps.txt");
1113 QFile f("data/DerivedNormalizationProps.txt");
1115 qFatal("Couldn't find DerivedNormalizationProps.txt");
1117 f.open(QFile::ReadOnly);
1119 while (!f.atEnd()) {
// read one line into a fixed 1024-byte buffer
1122 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment
1125 int comment = line.indexOf('#');
1127 line = line.left(comment);
1129 if (line.trimmed().isEmpty())
1132 QList<QByteArray> l = line.split(';');
1133 Q_ASSERT(l.size() >= 2);
// only the Full_Composition_Exclusion property is of interest here
1135 QByteArray propName = l[1].trimmed();
1136 if (propName != "Full_Composition_Exclusion")
// collapse the ".." range separator to a single '.' so the range can be split
1140 QByteArray codes = l[0].trimmed();
1141 codes.replace("..", ".");
1142 QList<QByteArray> cl = codes.split('.');
1145 int from = cl[0].toInt(&ok, 16);
// a second component is the inclusive end of the range
1148 if (cl.size() == 2) {
1149 to = cl[1].toInt(&ok, 16);
1153 for (int codepoint = from; codepoint <= to; ++codepoint) {
1154 UnicodeData &d = UnicodeData::valueRef(codepoint);
1155 d.excludedComposition = true;
// second pass over all code points: collect composable pairs
1159 for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1160 UnicodeData &d = UnicodeData::valueRef(codepoint);
1161 if (!d.excludedComposition
1162 && d.decompositionType == QChar::Canonical
1163 && d.decomposition.size() > 1) {
1164 Q_ASSERT(d.decomposition.size() == 2);
1166 int part1 = d.decomposition.at(0);
1167 int part2 = d.decomposition.at(1);
1169 // all non-starters are listed in DerivedNormalizationProps.txt
1170 // and already excluded from composition
1171 Q_ASSERT(UnicodeData::valueRef(part1).p.combiningClass == 0);
1174 highestLigature = qMax(highestLigature, part1);
1175 Ligature l = { part1, part2, codepoint };
1176 ligatureHashes[part2].append(l);
// One normalization correction record as parsed from the data file.
1182 struct NormalizationCorrection {
// Parses data/NormalizationCorrections.txt and builds, in the byte array `out`,
// C++ source text for a NormalizationCorrection struct, the
// uc_normalization_corrections[] table and two enum constants
// (NumNormalizationCorrections, NormalizationCorrectionsVersionMax).
// Each record has four ';'-separated fields: field 0 is the code point and
// field 1 the mapping (both hexadecimal); field 3 is the version, which must be
// "3.2.0" or "4.0.0".
1188 static QByteArray createNormalizationCorrections()
1190 qDebug("Reading NormalizationCorrections.txt");
1192 QFile f("data/NormalizationCorrections.txt");
1194 qFatal("Couldn't find NormalizationCorrections.txt");
1196 f.open(QFile::ReadOnly);
1200 out += "struct NormalizationCorrection {\n"
1202 " uint old_mapping;\n"
1206 "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1209 int numCorrections = 0;
1210 while (!f.atEnd()) {
// read one line into a fixed 1024-byte buffer
1213 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment, then all spaces
1216 int comment = line.indexOf('#');
1218 line = line.left(comment);
1219 line.replace(" ", "");
// code point ranges are not supported in this file
1224 Q_ASSERT(!line.contains(".."));
1226 QList<QByteArray> fields = line.split(';');
1227 Q_ASSERT(fields.size() == 4);
1229 NormalizationCorrection c = { 0, 0, 0 };
1231 c.codepoint = fields.at(0).toInt(&ok, 16);
1233 c.mapped = fields.at(1).toInt(&ok, 16);
1235 if (fields.at(3) == "3.2.0")
1236 c.version = QChar::Unicode_3_2;
1237 else if (fields.at(3) == "4.0.0")
1238 c.version = QChar::Unicode_4_0;
// NOTE(review): the message below says "NormalizationCorrection.txt" while the
// file opened above is "NormalizationCorrections.txt".
1240 qFatal("unknown unicode version in NormalizationCorrection.txt");
// emit one table row: { 0x<codepoint>, 0x<mapped>, <version> }
1242 out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1243 + ", " + QString::number(c.version) + " },\n";
1245 maxVersion = qMax(c.version, maxVersion);
1250 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1251 "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
// Parses data/LineBreak.txt. After comment and space stripping each record is
// "from[..to];class", with hexadecimal code points. The class name is
// translated through line_break_map (unknown names abort) and stored as the
// line break class of every code point in the range.
1256 static void readLineBreak()
1258 qDebug("Reading LineBreak.txt");
1260 QFile f("data/LineBreak.txt");
1262 qFatal("Couldn't find LineBreak.txt");
1264 f.open(QFile::ReadOnly);
1266 while (!f.atEnd()) {
// read one line into a fixed 1024-byte buffer
1269 int len = f.readLine(line.data(), 1024);
// strip a trailing '#' comment, then all spaces
1272 int comment = line.indexOf('#');
1274 line = line.left(comment);
1275 line.replace(" ", "");
1280 QList<QByteArray> l = line.split(';');
1281 Q_ASSERT(l.size() == 2);
// collapse the ".." range separator to a single '.' so the range can be split
1283 QByteArray codes = l[0];
1284 codes.replace("..", ".");
1285 QList<QByteArray> cl = codes.split('.');
1288 int from = cl[0].toInt(&ok, 16);
// a second component is the inclusive end of the range
1291 if (cl.size() == 2) {
1292 to = cl[1].toInt(&ok, 16);
1296 LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1297 if (lb == LineBreak_Unassigned)
1298 qFatal("unassigned line break class: %s", l[1].constData());
1300 for (int codepoint = from; codepoint <= to; ++codepoint) {
1301 UnicodeData &d = UnicodeData::valueRef(codepoint);
1302 d.p.line_break_class = lb;
// Reads data/SpecialCasing.txt and records lower/title/upper case mappings
// that consist of more than one code point through appendToSpecialCaseMap().
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1307 static void readSpecialCasing()
1309 qDebug("Reading SpecialCasing.txt");
1311 QFile f("data/SpecialCasing.txt");
1313 qFatal("Couldn't find SpecialCasing.txt");
1315 f.open(QFile::ReadOnly);
1317 while (!f.atEnd()) {
// Read one line (at most 1024 bytes) into the line buffer.
1320 int len = f.readLine(line.data(), 1024);
// Cut the line at the first '#' (start of a comment).
1323 int comment = line.indexOf('#');
1325 line = line.left(comment);
// Fields are trimmed one by one below, since the mapping fields are
// space-separated lists of code points.
1330 QList<QByteArray> l = line.split(';');
// A fifth field, when present, holds a condition. What happens to records
// with a non-empty condition is not visible here (presumably they are
// skipped - confirm).
1332 QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1333 if (!condition.isEmpty())
// Field 0 is the source code point in hexadecimal.
1338 int codepoint = l[0].trimmed().toInt(&ok, 16);
1341 // if the condition below doesn't hold anymore we need to modify our
1342 // lower/upper/title casing code and case folding code
1343 Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1345 // qDebug() << "codepoint" << hex << codepoint;
1346 // qDebug() << line;
// Fields 1, 2 and 3 are the lower, title and upper case mappings, each a
// space-separated list of hexadecimal code points.
1348 QList<QByteArray> lower = l[1].trimmed().split(' ');
1349 QList<int> lowerMap;
1350 for (int i = 0; i < lower.size(); ++i) {
1352 lowerMap.append(lower.at(i).toInt(&ok, 16));
1356 QList<QByteArray> title = l[2].trimmed().split(' ');
1357 QList<int> titleMap;
1358 for (int i = 0; i < title.size(); ++i) {
1360 titleMap.append(title.at(i).toInt(&ok, 16));
1364 QList<QByteArray> upper = l[3].trimmed().split(' ');
1365 QList<int> upperMap;
1366 for (int i = 0; i < upper.size(); ++i) {
1368 upperMap.append(upper.at(i).toInt(&ok, 16));
// A mapping with a single element must agree with the simple case diff that
// is already stored for this code point.
1373 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1374 Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1375 Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1376 Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
// For longer mappings set the *Special flag and overwrite the diff with the
// value returned by appendToSpecialCaseMap() (presumably an offset into
// specialCaseMap - confirm against that helper).
1378 if (lowerMap.size() > 1) {
1379 ud.p.lowerCaseSpecial = true;
1380 ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1382 if (titleMap.size() > 1) {
1383 ud.p.titleCaseSpecial = true;
1384 ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1386 if (upperMap.size() > 1) {
1387 ud.p.upperCaseSpecial = true;
1388 ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
// Largest absolute simple case folding diff recorded by readCaseFolding();
// createPropertyInfo() asserts that it stays below 1<<14.
1393 static int maxCaseFoldDiff = 0;
// Reads data/CaseFolding.txt and stores the case folding of each listed code
// point, either as a plain diff or, when the diff is too large, through
// appendToSpecialCaseMap().
// NOTE(review): the guard in front of the first qFatal() is not visible in
// this excerpt.
1395 static void readCaseFolding()
1397 qDebug("Reading CaseFolding.txt");
1399 QFile f("data/CaseFolding.txt");
1401 qFatal("Couldn't find CaseFolding.txt");
1403 f.open(QFile::ReadOnly);
1405 while (!f.atEnd()) {
// Read one line (at most 1024 bytes) into the line buffer.
1408 int len = f.readLine(line.data(), 1024);
// Cut the line at the first '#' (start of a comment).
1411 int comment = line.indexOf('#');
1413 line = line.left(comment);
1418 QList<QByteArray> l = line.split(';');
// Field 0 is the source code point in hexadecimal.
1421 int codepoint = l[0].trimmed().toInt(&ok, 16);
// Field 1 is the status letter. What happens to "F" and "T" records is not
// visible here (presumably they are skipped - confirm).
1425 l[1] = l[1].trimmed();
1426 if (l[1] == "F" || l[1] == "T")
1429 // qDebug() << "codepoint" << hex << codepoint;
1430 // qDebug() << line;
// Field 2 is the folded form: a space-separated list of hexadecimal code points.
1431 QList<QByteArray> fold = l[2].trimmed().split(' ');
1433 for (int i = 0; i < fold.size(); ++i) {
1435 foldMap.append(fold.at(i).toInt(&ok, 16));
1439 UnicodeData &ud = UnicodeData::valueRef(codepoint);
// Folding to a single code point: normally stored as a diff.
1440 if (foldMap.size() == 1) {
1441 int caseFolded = foldMap.at(0);
1442 int diff = caseFolded - codepoint;
// A diff of 1<<14 or more in magnitude is routed through the special case
// map instead of being stored directly.
1443 if (qAbs(diff) >= (1<<14)) {
1444 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1445 // if the condition below doesn't hold anymore we need to modify our special case folding code in qchar.cpp
1446 Q_ASSERT(!QChar::requiresSurrogates(codepoint) && !QChar::requiresSurrogates(caseFolded));
1447 ud.p.caseFoldSpecial = true;
1448 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
// Presumably the else branch: store the diff and track its maximum magnitude.
1450 ud.p.caseFoldDiff = diff;
1451 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
// When surrogates are involved, both code points must share the high
// surrogate and their low surrogates must differ by exactly diff.
1453 if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(caseFolded)) {
1454 // if the conditions below don't hold anymore we need to modify our case folding code
1455 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1456 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1458 // if (caseFolded != codepoint + ud.p.lowerCaseDiff)
1459 // qDebug() << hex << codepoint;
// Presumably the branch for foldings longer than one code point.
// NOTE(review): qFatal() normally aborts, so the two assignments after it look
// unreachable in practice - confirm.
1461 qFatal("we currently don't support full case foldings");
1462 // qDebug() << "special" << hex << foldMap;
1463 ud.p.caseFoldSpecial = true;
1464 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
// Reads data/GraphemeBreakProperty.txt and stores the grapheme break class of
// every listed code point (or inclusive range) in UnicodeData::p.graphemeBreak.
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1469 static void readGraphemeBreak()
1471 qDebug("Reading GraphemeBreakProperty.txt");
1473 QFile f("data/GraphemeBreakProperty.txt");
1475 qFatal("Couldn't find GraphemeBreakProperty.txt");
1477 f.open(QFile::ReadOnly);
1479 while (!f.atEnd()) {
// Read one line (at most 1024 bytes) into the line buffer.
1482 int len = f.readLine(line.data(), 1024);
// Cut the line at the first '#' (start of a comment) and remove all spaces.
1485 int comment = line.indexOf('#');
1487 line = line.left(comment);
1488 line.replace(" ", "");
// Every record has exactly two fields: "<code point(s)>;<class name>".
1493 QList<QByteArray> l = line.split(';');
1494 Q_ASSERT(l.size() == 2);
// Turn "from..to" into "from.to" so that split('.') yields one or two items.
1496 QByteArray codes = l[0];
1497 codes.replace("..", ".");
1498 QList<QByteArray> cl = codes.split('.');
// Code points are hexadecimal; a second item, if present, is the range end.
1501 int from = cl[0].toInt(&ok, 16);
1504 if (cl.size() == 2) {
1505 to = cl[1].toInt(&ok, 16);
// A class name that is missing from grapheme_break_map yields
// GraphemeBreak_Unassigned, which is treated as a fatal error.
1509 GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1510 if (brk == GraphemeBreak_Unassigned)
1511 qFatal("unassigned grapheme break class: %s", l[1].constData());
// Apply the class to every code point in the inclusive range [from, to].
1513 for (int codepoint = from; codepoint <= to; ++codepoint) {
1514 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1515 ud.p.graphemeBreak = brk;
// Reads data/WordBreakProperty.txt and stores the word break class of every
// listed code point (or inclusive range) in UnicodeData::p.wordBreak.
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1520 static void readWordBreak()
1522 qDebug("Reading WordBreakProperty.txt");
1524 QFile f("data/WordBreakProperty.txt");
1526 qFatal("Couldn't find WordBreakProperty.txt");
1528 f.open(QFile::ReadOnly);
1530 while (!f.atEnd()) {
// Read one line (at most 1024 bytes) into the line buffer.
1533 int len = f.readLine(line.data(), 1024);
// Cut the line at the first '#' (start of a comment) and remove all spaces.
1536 int comment = line.indexOf('#');
1538 line = line.left(comment);
1539 line.replace(" ", "");
// Every record has exactly two fields: "<code point(s)>;<class name>".
1544 QList<QByteArray> l = line.split(';');
1545 Q_ASSERT(l.size() == 2);
// Turn "from..to" into "from.to" so that split('.') yields one or two items.
1547 QByteArray codes = l[0];
1548 codes.replace("..", ".");
1549 QList<QByteArray> cl = codes.split('.');
// Code points are hexadecimal; a second item, if present, is the range end.
1552 int from = cl[0].toInt(&ok, 16);
1555 if (cl.size() == 2) {
1556 to = cl[1].toInt(&ok, 16);
// A class name that is missing from word_break_map yields
// WordBreak_Unassigned, which is treated as a fatal error.
1560 WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned);
1561 if (brk == WordBreak_Unassigned)
1562 qFatal("unassigned word break class: %s", l[1].constData());
// Apply the class to every code point in the inclusive range [from, to].
1564 for (int codepoint = from; codepoint <= to; ++codepoint) {
1565 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1566 ud.p.wordBreak = brk;
// Reads data/SentenceBreakProperty.txt and stores the sentence break class of
// every listed code point (or inclusive range) in UnicodeData::p.sentenceBreak.
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1571 static void readSentenceBreak()
1573 qDebug("Reading SentenceBreakProperty.txt");
1575 QFile f("data/SentenceBreakProperty.txt");
1577 qFatal("Couldn't find SentenceBreakProperty.txt");
1579 f.open(QFile::ReadOnly);
1581 while (!f.atEnd()) {
// Read one line (at most 1024 bytes) into the line buffer.
1584 int len = f.readLine(line.data(), 1024);
// Cut the line at the first '#' (start of a comment) and remove all spaces.
1587 int comment = line.indexOf('#');
1589 line = line.left(comment);
1590 line.replace(" ", "");
// Every record has exactly two fields: "<code point(s)>;<class name>".
1595 QList<QByteArray> l = line.split(';');
1596 Q_ASSERT(l.size() == 2);
// Turn "from..to" into "from.to" so that split('.') yields one or two items.
1598 QByteArray codes = l[0];
1599 codes.replace("..", ".");
1600 QList<QByteArray> cl = codes.split('.');
// Code points are hexadecimal; a second item, if present, is the range end.
1603 int from = cl[0].toInt(&ok, 16);
1606 if (cl.size() == 2) {
1607 to = cl[1].toInt(&ok, 16);
// A class name that is missing from sentence_break_map yields
// SentenceBreak_Unassigned, which is treated as a fatal error.
1611 SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
1612 if (brk == SentenceBreak_Unassigned)
1613 qFatal("unassigned sentence break class: %s", l[1].constData());
// Apply the class to every code point in the inclusive range [from, to].
1615 for (int codepoint = from; codepoint <= to; ++codepoint) {
1616 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1617 ud.p.sentenceBreak = brk;
1623 // this piece of code does full case folding and comparison. We currently
1624 // don't use it, since this gives lots of issues with things as case insensitive
1625 // search and replace.
// Writes the case-folded form of ch to out. Without special folding this is a
// single unit, ch + caseFoldDiff. Otherwise caseFoldDiff is used as an offset
// into specialCaseMap, where the first entry is the length of the folded
// sequence. The code that copies that sequence is not visible in this excerpt.
1626 static inline void foldCase(uint ch, ushort *out)
1628 const QUnicodeTables::Properties *p = qGetProp(ch);
1629 if (!p->caseFoldSpecial) {
1630 *(out++) = ch + p->caseFoldDiff;
1632 const ushort *folded = specialCaseMap + p->caseFoldDiff;
1633 ushort length = *folded++;
// Case-insensitive comparison of the UTF-16 ranges [a, ae) and [b, be) based
// on case folding. Large parts of the body are not visible in this excerpt, so
// the notes below cover only the visible statements.
1640 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
// Simple path: runs while both ranges still have input.
1649 while (a != ae && b != be) {
1650 const QUnicodeTables::Properties *pa = qGetProp(*a);
1651 const QUnicodeTables::Properties *pb = qGetProp(*b);
// True when either unit needs special (multi-unit) folding; the action taken
// is not visible here.
1652 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
// Difference of the two simply folded units.
1654 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
// Buffers for the foldCase() output of each side; only their first entries
// are zeroed here.
1668 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1669 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1670 abuf[0] = bbuf[0] = 0;
1676 if (!*bp && b == be)
// Fold the next unit of each side into its buffer and advance the input.
1680 foldCase(*(a++), abuf);
1686 foldCase(*(b++), bbuf);
1690 return (int)*ap - (int)*bp;
// Case-insensitive comparison of the UTF-16 range [a, ae) with the uchar
// string b, which the visible loop treats as zero-terminated. Large parts of
// the body are not visible in this excerpt.
1697 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
// Simple path: runs until a is exhausted or b reaches its terminating zero.
1704 while (a != ae && *b) {
1705 const QUnicodeTables::Properties *pa = qGetProp(*a);
1706 const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
// True when either unit needs special (multi-unit) folding; the action taken
// is not visible here.
1707 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
// Difference of the two simply folded units.
1709 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
// Buffers for the foldCase() output of each side; only their first entries
// are zeroed here.
1723 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1724 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1725 abuf[0] = bbuf[0] = 0;
// Fold the next unit of each side into its buffer and advance the input.
1735 foldCase(*(a++), abuf);
1741 foldCase(*(b++), bbuf);
1745 return (int)*ap - (int)*bp;
// Distinct block names in order of first appearance; a name's position in
// this list is used as its block index.
1753 static QList<QByteArray> blockNames;
// One { blockIndex, first, last } entry per record read from Blocks.txt.
1760 static QList<BlockInfo> blockInfoList;
// Reads data/Blocks.txt and fills blockNames and blockInfoList.
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1762 static void readBlocks()
1764 qDebug("Reading Blocks.txt");
1766 QFile f("data/Blocks.txt");
1768 qFatal("Couldn't find Blocks.txt");
1770 f.open(QFile::ReadOnly);
1772 while (!f.atEnd()) {
// Drop the last byte of the line that was read.
// NOTE(review): this presumably removes the trailing newline and so assumes
// that every line, including the last one, ends with one - confirm.
1773 QByteArray line = f.readLine();
1774 line.resize(line.size() - 1);
// Cut the line at the first '#' (start of a comment).
1776 int comment = line.indexOf("#");
1778 line = line.left(comment);
// Remove all spaces; block names are therefore stored without spaces.
1780 line.replace(" ", "");
// Split the record at the first ';' into "<code points>" and "<block name>".
1785 int semicolon = line.indexOf(';');
1786 Q_ASSERT(semicolon >= 0);
1787 QByteArray codePoints = line.left(semicolon);
1788 QByteArray blockName = line.mid(semicolon + 1);
// Register the block name if it has not been seen before.
1790 int blockIndex = blockNames.indexOf(blockName);
1791 if (blockIndex == -1) {
1792 blockIndex = blockNames.size();
1793 blockNames.append(blockName);
// Turn "first..last" into "first.last" and parse both ends as hexadecimal.
1796 codePoints.replace("..", ".");
1797 QList<QByteArray> cl = codePoints.split('.');
1800 int first = cl[0].toInt(&ok, 16);
1803 if (cl.size() == 2) {
1804 last = cl[1].toInt(&ok, 16);
1808 BlockInfo blockInfo = { blockIndex, first, last };
1809 blockInfoList.append(blockInfo);
// Script names in the order they were registered by readScripts().
1814 static QList<QByteArray> scriptNames;
// Parallel to scriptNames: the value that readScripts() stores in
// UnicodeData::p.script for code points of that script.
1815 static QList<int> scriptMap;
// Script names that readScripts() registers up front with their array index
// as script value. The array contents are not visible in this excerpt.
1817 static const char *specialScripts[] = {
// Number of entries in specialScripts.
1848 enum { specialScriptsCount = sizeof(specialScripts) / sizeof(const char *) };
// Reads data/Scripts.txt, registers every script name in scriptNames and
// scriptMap, and stores the mapped script value of every listed code point
// (or inclusive range) in UnicodeData::p.script.
// NOTE(review): the guard in front of the qFatal() is not visible in this excerpt.
1850 static void readScripts()
1852 qDebug("Reading Scripts.txt");
1854 QFile f("data/Scripts.txt");
1856 qFatal("Couldn't find Scripts.txt");
1858 f.open(QFile::ReadOnly);
// Next script value to hand out; starts right after the special scripts.
1860 int scriptsCount = specialScriptsCount;
1861 // ### preserve the old ordering (temporary)
// Register the special scripts first; each maps to its own array index.
1862 for (int i = 0; i < specialScriptsCount; ++i) {
1863 scriptNames.append(specialScripts[i]);
1864 scriptMap.append(i);
1867 while (!f.atEnd()) {
// Drop the last byte of the line that was read.
// NOTE(review): this presumably removes the trailing newline and so assumes
// that every line, including the last one, ends with one - confirm.
1868 QByteArray line = f.readLine();
1869 line.resize(line.size() - 1);
// Cut the line at the first '#' (start of a comment).
1871 int comment = line.indexOf("#");
1873 line = line.left(comment);
// Remove all spaces and underscores, which also strips them from script names.
1875 line.replace(" ", "");
1876 line.replace("_", "");
// Split the record at the first ';' into "<code points>" and "<script name>".
1881 int semicolon = line.indexOf(';');
1882 Q_ASSERT(semicolon >= 0);
1883 QByteArray codePoints = line.left(semicolon);
1884 QByteArray scriptName = line.mid(semicolon + 1);
// Turn "first..last" into "first.last" and parse both ends as hexadecimal.
1886 codePoints.replace("..", ".");
1887 QList<QByteArray> cl = codePoints.split('.');
1890 int first = cl[0].toInt(&ok, 16);
1893 if (cl.size() == 2) {
1894 last = cl[1].toInt(&ok, 16);
// Register the script name if it has not been seen before.
1898 int scriptIndex = scriptNames.indexOf(scriptName);
1899 if (scriptIndex == -1) {
1900 scriptIndex = scriptNames.size();
1901 scriptNames.append(scriptName);
1903 // is the script alias for 'Common'?
// The loop that steps s through specialScripts is not visible in this excerpt.
1904 int s = specialScriptsCount;
1906 if (scriptName == specialScripts[s])
// s > 0 hands out a fresh script value; otherwise the script maps to 0.
1909 scriptMap.append(s > 0 ? scriptsCount++ : 0);
// Store the mapped script value for every code point in [first, last].
1912 for (int codepoint = first; codepoint <= last; ++codepoint) {
1913 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1914 ud.p.script = scriptMap.at(scriptIndex);
// Builds the C++ source text of "enum Script" from scriptNames and scriptMap.
// The enum header already emits Common, so both loops start at index 1.
// NOTE(review): the statement controlled by each `if` inside the loops is not
// visible in this excerpt (presumably `continue`).
1919 static QByteArray createScriptEnumDeclaration()
1921 QByteArray declaration;
1923 declaration += "// See http://www.unicode.org/reports/tr24/tr24-5.html\n";
1924 declaration += "enum Script {\n Common";
1926 // output the ones with special processing first
1927 for (int i = 1; i < scriptNames.size(); ++i) {
1928 if (scriptMap.at(i) == 0)
1930 declaration += ",\n ";
1931 declaration += scriptNames.at(i);
// ScriptCount is emitted as an alias of Inherited.
1933 declaration += ",\n ScriptCount = Inherited";
1935 // output the ones that are an alias for 'Common'
1936 for (int i = 1; i < scriptNames.size(); ++i) {
1937 if (scriptMap.at(i) != 0)
1939 declaration += ",\n ";
1940 declaration += scriptNames.at(i);
1941 declaration += " = Common";
1944 declaration += "\n};\n\n";
// Debug helper: prints selected properties of every code point in the
// inclusive range [from, to] with qDebug().
1950 static void dump(int from, int to)
1952 for (int i = from; i <= to; ++i) {
1953 UnicodeData &d = UnicodeData::valueRef(i);
1954 qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
1955 i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
// For characters that have a decomposition, also print its type, its length
// and its first element.
1956 if (d.decompositionType != QChar::NoDecomposition) {
1957 qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
1958 d.decomposition[0]);
// The distinct PropertyFlags values collected by computeUniqueProperties().
1965 static QList<PropertyFlags> uniqueProperties;
// Assigns every code point from 0 to QChar::LastValidCodePoint the index of
// its PropertyFlags value in uniqueProperties, appending values as needed.
1967 static void computeUniqueProperties()
1969 qDebug("computeUniqueProperties:");
1970 for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1971 UnicodeData &d = UnicodeData::valueRef(codepoint);
// Linear search for an equal entry. The condition guarding the append below
// is not visible in this excerpt (presumably index == -1).
1972 int index = uniqueProperties.indexOf(d.p);
1974 index = uniqueProperties.size();
1975 uniqueProperties.append(d.p);
1977 d.propertyIndex = index;
1979 qDebug(" %d unique unicode properties found", uniqueProperties.size());
// One block of table values used by the table generators below.
// Blocks are de-duplicated with QList::indexOf(), hence the equality operator.
1982 struct UniqueBlock {
// index starts out as -1.
1983 inline UniqueBlock() : index(-1) {}
// Two blocks are equal when their values are equal; index is not compared.
1985 inline bool operator==(const UniqueBlock &other) const
1986 { return values == other.values; }
1989 QVector<int> values;
// Builds the C++ source text for the property lookup: the uc_property_trie
// table, the GET_PROP_INDEX / GET_PROP_INDEX_UCS2 macros, the uc_properties
// array and the exported accessor functions.
// NOTE(review): several statements (declarations of out, used and b, the
// separators between emitted values, the final return) are not visible in
// this excerpt.
1992 static QByteArray createPropertyInfo()
1994 qDebug("createPropertyInfo:");
1996 // we reserve one bit more than in the assert below for the sign
1997 Q_ASSERT(maxMirroredDiff < (1<<12));
1998 Q_ASSERT(maxLowerCaseDiff < (1<<14));
1999 Q_ASSERT(maxUpperCaseDiff < (1<<14));
2000 Q_ASSERT(maxTitleCaseDiff < (1<<14));
2001 Q_ASSERT(maxCaseFoldDiff < (1<<14));
// Table layout: code points below BMP_END use blocks of BMP_BLOCKSIZE
// (1 << BMP_SHIFT) entries, code points from BMP_END up to SMP_END use
// blocks of SMP_BLOCKSIZE (1 << SMP_SHIFT) entries.
2003 const int BMP_BLOCKSIZE = 32;
2004 const int BMP_SHIFT = 5;
2005 const int BMP_END = 0x11000;
2006 const int SMP_END = 0x110000;
2007 const int SMP_BLOCKSIZE = 256;
2008 const int SMP_SHIFT = 8;
// uniqueBlocks holds the distinct blocks; blockMap holds, per block of code
// points, the index member of the matching unique block.
2010 QList<UniqueBlock> uniqueBlocks;
2011 QVector<int> blockMap;
// First range: collect the property index of every code point in each block.
2014 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2016 b.values.reserve(BMP_BLOCKSIZE);
2017 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2018 int uc = block*BMP_BLOCKSIZE + i;
2019 UnicodeData &d = UnicodeData::valueRef(uc);
2020 b.values.append(d.propertyIndex);
// Reuse an equal block if one exists. The condition guarding the append below
// is not visible in this excerpt (presumably index == -1).
2022 int index = uniqueBlocks.indexOf(b);
2024 index = uniqueBlocks.size();
2026 used += BMP_BLOCKSIZE;
2027 uniqueBlocks.append(b);
2029 blockMap.append(uniqueBlocks.at(index).index);
2031 int bmp_blocks = uniqueBlocks.size();
// Second range: same procedure with the larger block size.
2033 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2035 b.values.reserve(SMP_BLOCKSIZE);
2036 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2037 int uc = block*SMP_BLOCKSIZE + i;
2038 UnicodeData &d = UnicodeData::valueRef(uc);
2039 b.values.append(d.propertyIndex);
2041 int index = uniqueBlocks.indexOf(b);
2043 index = uniqueBlocks.size();
2045 used += SMP_BLOCKSIZE;
2046 uniqueBlocks.append(b);
2048 blockMap.append(uniqueBlocks.at(index).index);
2050 int smp_blocks = uniqueBlocks.size() - bmp_blocks;
// Report the memory footprint of the generated tables.
2052 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2053 int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2054 int bmp_mem = bmp_block_data + bmp_trie;
2055 qDebug(" %d unique blocks in BMP.", bmp_blocks);
2056 qDebug(" block data uses: %d bytes", bmp_block_data);
2057 qDebug(" trie data uses : %d bytes", bmp_trie);
2059 int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2060 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2061 int smp_mem = smp_block_data + smp_trie;
2062 qDebug(" %d unique blocks in SMP.", smp_blocks);
2063 qDebug(" block data uses: %d bytes", smp_block_data);
2064 qDebug(" trie data uses : %d bytes", smp_trie);
2066 int prop_data = uniqueProperties.size() * SizeOfPropertiesStruct;
2067 qDebug("\n properties data uses : %d bytes", prop_data);
2068 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + prop_data);
// The values written into the table must fit into an unsigned short.
2070 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
// Emit the table. The map comes first and the block data follows in the same
// array, which is why every map entry is offset by blockMap.size().
2074 out += "static const unsigned short uc_property_trie[] = {\n";
2075 // first write the map
2076 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2077 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2079 if (out.endsWith(' '))
2081 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2085 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2088 if (out.endsWith(' '))
2090 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2091 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2093 if (out.endsWith(' '))
2095 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2099 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2102 if (out.endsWith(' '))
// Then write the values of every unique block.
2106 for (int i = 0; i < uniqueBlocks.size(); ++i) {
2107 if (out.endsWith(' '))
2110 const UniqueBlock &b = uniqueBlocks.at(i);
2111 for (int j = 0; j < b.values.size(); ++j) {
2113 if (out.endsWith(' '))
2117 out += QByteArray::number(b.values.at(j));
2121 if (out.endsWith(' '))
// Emit the lookup macros. GET_PROP_INDEX picks the first or second range by
// comparing against BMP_END; GET_PROP_INDEX_UCS2 always uses the first range.
2125 out += "#define GET_PROP_INDEX(ucs4) \\\n"
2126 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2127 " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2128 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2129 " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2130 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2131 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
2132 "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
2133 " (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
2134 "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n";
// Emit one initializer row per unique property set. The quoted comments name
// the field of the generated struct that each value belongs to.
2136 out += "static const Properties uc_properties[] = {";
2137 // keep in sync with the property declaration
2138 for (int i = 0; i < uniqueProperties.size(); ++i) {
2139 const PropertyFlags &p = uniqueProperties.at(i);
2141 // " ushort category : 8; /* 5 used */\n"
2142 out += QByteArray::number( p.category );
2144 // " ushort direction : 8; /* 5 used */\n"
2145 out += QByteArray::number( p.direction );
2147 // " ushort combiningClass : 8;\n"
2148 out += QByteArray::number( p.combiningClass );
2150 // " ushort joining : 2;\n"
2151 out += QByteArray::number( p.joining );
2153 // " signed short digitValue : 6; /* 5 used */\n"
2154 out += QByteArray::number( p.digitValue );
2156 // " signed short mirrorDiff : 16;\n"
2157 // " signed short lowerCaseDiff : 16;\n"
2158 // " signed short upperCaseDiff : 16;\n"
2159 // " signed short titleCaseDiff : 16;\n"
2160 // " signed short caseFoldDiff : 16;\n"
2161 out += QByteArray::number( p.mirrorDiff );
2163 out += QByteArray::number( p.lowerCaseDiff );
2165 out += QByteArray::number( p.upperCaseDiff );
2167 out += QByteArray::number( p.titleCaseDiff );
2169 out += QByteArray::number( p.caseFoldDiff );
2171 // " ushort lowerCaseSpecial : 1;\n"
2172 // " ushort upperCaseSpecial : 1;\n"
2173 // " ushort titleCaseSpecial : 1;\n"
2174 // " ushort caseFoldSpecial : 1;\n"
2175 out += QByteArray::number( p.lowerCaseSpecial );
2177 out += QByteArray::number( p.upperCaseSpecial );
2179 out += QByteArray::number( p.titleCaseSpecial );
2181 out += QByteArray::number( p.caseFoldSpecial );
2183 // " ushort unicodeVersion : 4;\n"
2184 out += QByteArray::number( p.age );
2186 // " ushort graphemeBreak : 8; /* 4 used */\n"
2187 // " ushort wordBreak : 8; /* 4 used */\n"
2188 // " ushort sentenceBreak : 8; /* 4 used */\n"
2189 // " ushort line_break_class : 8; /* 6 used */\n"
2190 out += QByteArray::number( p.graphemeBreak );
2192 out += QByteArray::number( p.wordBreak );
2194 out += QByteArray::number( p.sentenceBreak );
2196 out += QByteArray::number( p.line_break_class );
2198 // " ushort script : 8; /* 5 used */\n"
2199 out += QByteArray::number( p.script );
// Emit the lookup helpers and the exported accessors built on top of them.
2206 out += "static inline const Properties *qGetProp(uint ucs4)\n"
2208 " const int index = GET_PROP_INDEX(ucs4);\n"
2209 " return uc_properties + index;\n"
2212 "static inline const Properties *qGetProp(ushort ucs2)\n"
2214 " const int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2215 " return uc_properties + index;\n"
2218 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4)\n"
2220 " return qGetProp(ucs4);\n"
2223 "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2)\n"
2225 " return qGetProp(ucs2);\n"
2228 out += "Q_CORE_EXPORT GraphemeBreak QT_FASTCALL graphemeBreakClass(uint ucs4)\n"
2230 " return (GraphemeBreak)qGetProp(ucs4)->graphemeBreak;\n"
2233 "Q_CORE_EXPORT WordBreak QT_FASTCALL wordBreakClass(uint ucs4)\n"
2235 " return (WordBreak)qGetProp(ucs4)->wordBreak;\n"
2238 "Q_CORE_EXPORT SentenceBreak QT_FASTCALL sentenceBreakClass(uint ucs4)\n"
2240 " return (SentenceBreak)qGetProp(ucs4)->sentenceBreak;\n"
2243 "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4)\n"
2245 " return (LineBreakClass)qGetProp(ucs4)->line_break_class;\n"
2248 "Q_CORE_EXPORT Script QT_FASTCALL script(uint ucs4)\n"
2250 " return (Script)qGetProp(ucs4)->script;\n"
// Builds the C++ source text of the specialCaseMap array from the collected
// specialCaseMap data.
// NOTE(review): the declarations of out and i, the advance of i and the final
// return are not visible in this excerpt.
2256 static QByteArray createSpecialCaseMap()
2258 qDebug("createSpecialCaseMap:");
// The generated array starts with a placeholder entry.
2262 out += "static const ushort specialCaseMap[] = {\n"
2263 " 0x0, // placeholder";
2265 while (i < specialCaseMap.size()) {
// Each record is a length n followed by n values; the loop below writes all
// n + 1 entries in hexadecimal.
2267 int n = specialCaseMap.at(i);
2268 for (int j = 0; j <= n; ++j) {
2269 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
2277 qDebug(" memory usage: %d bytes", specialCaseMap.size()*sizeof(unsigned short));
// Builds the C++ source text for the decomposition lookup: the
// uc_decomposition_trie table, the GET_DECOMPOSITION_INDEX macro and the
// uc_decomposition_map array.
// NOTE(review): several statements (declarations of out, used, tableIndex and
// b, the utf16Length updates, the separators between emitted values, the
// final return) are not visible in this excerpt.
2283 static QByteArray createCompositionInfo()
2285 qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);
// Table layout: code points below BMP_END use blocks of BMP_BLOCKSIZE
// (1 << BMP_SHIFT) entries, code points from BMP_END up to SMP_END use
// blocks of SMP_BLOCKSIZE (1 << SMP_SHIFT) entries.
2287 const int BMP_BLOCKSIZE = 16;
2288 const int BMP_SHIFT = 4;
2289 const int BMP_END = 0x3400; // start of Han
2290 const int SMP_END = 0x30000;
2291 const int SMP_BLOCKSIZE = 256;
2292 const int SMP_SHIFT = 8;
// The table has to extend beyond the highest composed character.
2294 if (SMP_END <= highestComposedCharacter)
2295 qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);
// Flat store of all decompositions. Each record is a header word (filled in
// below as decompositionType + (utf16Length << 8)) followed by the
// decomposition in UTF-16 code units.
2297 QVector<unsigned short> decompositions;
2300 QList<UniqueBlock> uniqueBlocks;
2301 QVector<int> blockMap;
// First range: per code point store the start of its record in
// decompositions, or 0xffff when it has no decomposition.
2304 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2306 b.values.reserve(BMP_BLOCKSIZE);
2307 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2308 int uc = block*BMP_BLOCKSIZE + i;
2309 UnicodeData &d = UnicodeData::valueRef(uc);
2310 if (!d.decomposition.isEmpty()) {
2311 int utf16Length = 0;
// Reserve the header word; it is overwritten once the length is known.
2312 decompositions.append(0);
2313 for (int j = 0; j < d.decomposition.size(); ++j) {
2314 int code = d.decomposition.at(j);
2315 if (QChar::requiresSurrogates(code)) {
2316 // save as surrogate pair
2317 decompositions.append(QChar::highSurrogate(code));
2318 decompositions.append(QChar::lowSurrogate(code));
2321 decompositions.append(code);
2325 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2326 b.values.append(tableIndex);
// Advance past the header word and the code units just written.
2327 tableIndex += utf16Length + 1;
2329 b.values.append(0xffff);
// Reuse an equal block if one exists. The condition guarding the append below
// is not visible in this excerpt (presumably index == -1).
2332 int index = uniqueBlocks.indexOf(b);
2334 index = uniqueBlocks.size();
2336 used += BMP_BLOCKSIZE;
2337 uniqueBlocks.append(b);
2339 blockMap.append(uniqueBlocks.at(index).index);
2341 int bmp_blocks = uniqueBlocks.size();
// Second range: same procedure with the larger block size.
2343 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2345 b.values.reserve(SMP_BLOCKSIZE);
2346 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2347 int uc = block*SMP_BLOCKSIZE + i;
2348 UnicodeData &d = UnicodeData::valueRef(uc);
2349 if (!d.decomposition.isEmpty()) {
2350 int utf16Length = 0;
2351 decompositions.append(0);
2352 for (int j = 0; j < d.decomposition.size(); ++j) {
2353 int code = d.decomposition.at(j);
2354 if (QChar::requiresSurrogates(code)) {
2355 // save as surrogate pair
2356 decompositions.append(QChar::highSurrogate(code));
2357 decompositions.append(QChar::lowSurrogate(code));
2360 decompositions.append(code);
2364 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2365 b.values.append(tableIndex);
2366 tableIndex += utf16Length + 1;
2368 b.values.append(0xffff);
2371 int index = uniqueBlocks.indexOf(b);
2373 index = uniqueBlocks.size();
2375 used += SMP_BLOCKSIZE;
2376 uniqueBlocks.append(b);
2378 blockMap.append(uniqueBlocks.at(index).index);
2380 int smp_blocks = uniqueBlocks.size() - bmp_blocks;
// 0xffff is stored above for "no decomposition", so record offsets must stay
// below it.
2382 // if the condition below doesn't hold anymore we need to modify our decomposition code
2383 Q_ASSERT(tableIndex < 0xffff);
// Report the memory footprint of the generated tables.
2385 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2386 int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2387 int bmp_mem = bmp_block_data + bmp_trie;
2388 qDebug(" %d unique blocks in BMP.", bmp_blocks);
2389 qDebug(" block data uses: %d bytes", bmp_block_data);
2390 qDebug(" trie data uses : %d bytes", bmp_trie);
2392 int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2393 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2394 int smp_mem = smp_block_data + smp_trie;
2395 qDebug(" %d unique blocks in SMP.", smp_blocks);
2396 qDebug(" block data uses: %d bytes", smp_block_data);
2397 qDebug(" trie data uses : %d bytes", smp_trie);
2399 int decomposition_data = decompositions.size() * 2;
2400 qDebug("\n decomposition data uses : %d bytes", decomposition_data);
2401 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + decomposition_data);
// The values written into the table must fit into an unsigned short.
2403 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
// Emit the table. The map comes first and the block data follows in the same
// array, which is why every map entry is offset by blockMap.size().
2407 out += "static const unsigned short uc_decomposition_trie[] = {\n";
2408 // first write the map
2409 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2410 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2412 if (out.endsWith(' '))
2414 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2418 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2421 if (out.endsWith(' '))
2423 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2424 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2426 if (out.endsWith(' '))
2428 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2432 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2435 if (out.endsWith(' '))
// Then write the values of every unique block in hexadecimal.
2439 for (int i = 0; i < uniqueBlocks.size(); ++i) {
2440 if (out.endsWith(' '))
2443 const UniqueBlock &b = uniqueBlocks.at(i);
2444 for (int j = 0; j < b.values.size(); ++j) {
2446 if (out.endsWith(' '))
2450 out += "0x" + QByteArray::number(b.values.at(j), 16);
2454 if (out.endsWith(' '))
// Emit the lookup macro. It distinguishes code points below BMP_END, below
// SMP_END and everything else; the last case is not visible in this excerpt.
2458 out += "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2459 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2460 " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2461 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2462 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2463 " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2464 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2465 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
// Emit the flat decomposition store in hexadecimal.
2468 out += "static const unsigned short uc_decomposition_map[] = {";
2469 for (int i = 0; i < decompositions.size(); ++i) {
2471 if (out.endsWith(' '))
2475 out += "0x" + QByteArray::number(decompositions.at(i), 16);
2478 if (out.endsWith(' '))
// Builds the C++ source text for the ligature (composition) lookup: the
// uc_ligature_trie table, the GET_LIGATURE_INDEX macro and the
// uc_ligature_map array.
// NOTE(review): several statements (declarations of out, used, tableIndex and
// b, the guards around the per-character code, the separators between emitted
// values, the final return) are not visible in this excerpt.
2485 static QByteArray createLigatureInfo()
2487 qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
// Sanity check: u1, u2 and the ligature of an entry must either all need
// surrogates or all not need them.
// NOTE(review): this loop looks up ligatureHashes.value(i) for i in
// [0, size()). If ligatureHashes is keyed by code point, as the lookups by uc
// below suggest, only the keys 0..size()-1 are inspected - confirm.
2489 for (int i = 0; i < ligatureHashes.size(); ++i) {
2490 const QList<Ligature> &l = ligatureHashes.value(i);
2491 for (int j = 0; j < l.size(); ++j) {
2492 // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2493 Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2494 QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
// Table layout: code points below BMP_END use blocks of BMP_BLOCKSIZE
// (1 << BMP_SHIFT) entries, code points from BMP_END up to SMP_END use
// blocks of SMP_BLOCKSIZE (1 << SMP_SHIFT) entries.
2498 const int BMP_BLOCKSIZE = 32;
2499 const int BMP_SHIFT = 5;
2500 const int BMP_END = 0x3100;
2501 const int SMP_END = 0x12000;
2502 const int SMP_BLOCKSIZE = 256;
2503 const int SMP_SHIFT = 8;
// The table has to extend beyond the highest ligature character.
2505 if (SMP_END <= highestLigature)
2506 qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
// Flat store of all ligature records. Each record is a count followed by one
// (u1, ligature) pair per ligature.
2508 QList<unsigned short> ligatures;
2511 QList<UniqueBlock> uniqueBlocks;
2512 QVector<int> blockMap;
// First range: per code point store the start of its record in ligatures, or
// 0xffff when there is none. Each pair takes two entries here.
2515 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2517 b.values.reserve(BMP_BLOCKSIZE);
2518 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2519 int uc = block*BMP_BLOCKSIZE + i;
2520 QList<Ligature> l = ligatureHashes.value(uc);
2522 Q_ASSERT(!QChar::requiresSurrogates(uc));
2523 qSort(l); // needed for bsearch in ligatureHelper code
2525 ligatures.append(l.size());
2526 for (int j = 0; j < l.size(); ++j) {
2527 ligatures.append(l.at(j).u1);
2528 ligatures.append(l.at(j).ligature);
2530 b.values.append(tableIndex);
// Advance past the count and the two entries per ligature.
2531 tableIndex += 2*l.size() + 1;
2533 b.values.append(0xffff);
// Reuse an equal block if one exists. The condition guarding the append below
// is not visible in this excerpt (presumably index == -1).
2536 int index = uniqueBlocks.indexOf(b);
2538 index = uniqueBlocks.size();
2540 used += BMP_BLOCKSIZE;
2541 uniqueBlocks.append(b);
2543 blockMap.append(uniqueBlocks.at(index).index);
2545 int bmp_blocks = uniqueBlocks.size();
// Second range: u1 and the ligature are each stored as a surrogate pair, so
// every ligature takes four entries.
2547 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2549 b.values.reserve(SMP_BLOCKSIZE);
2550 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2551 int uc = block*SMP_BLOCKSIZE + i;
2552 QList<Ligature> l = ligatureHashes.value(uc);
2554 Q_ASSERT(QChar::requiresSurrogates(uc));
2555 qSort(l); // needed for bsearch in ligatureHelper code
2557 ligatures.append(l.size());
2558 for (int j = 0; j < l.size(); ++j) {
2559 ligatures.append(QChar::highSurrogate(l.at(j).u1));
2560 ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2561 ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2562 ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2564 b.values.append(tableIndex);
2565 tableIndex += 4*l.size() + 1;
2567 b.values.append(0xffff);
2570 int index = uniqueBlocks.indexOf(b);
2572 index = uniqueBlocks.size();
2574 used += SMP_BLOCKSIZE;
2575 uniqueBlocks.append(b);
2577 blockMap.append(uniqueBlocks.at(index).index);
2579 int smp_blocks = uniqueBlocks.size() - bmp_blocks;
// 0xffff is stored above for "no ligatures", so record offsets must stay
// below it.
2581 // if the condition below doesn't hold anymore we need to modify our composition code
2582 Q_ASSERT(tableIndex < 0xffff);
// Report the memory footprint of the generated tables.
2584 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2585 int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2586 int bmp_mem = bmp_block_data + bmp_trie;
2587 qDebug(" %d unique blocks in BMP.", bmp_blocks);
2588 qDebug(" block data uses: %d bytes", bmp_block_data);
2589 qDebug(" trie data uses : %d bytes", bmp_trie);
2591 int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2592 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2593 int smp_mem = smp_block_data + smp_trie;
2594 qDebug(" %d unique blocks in SMP.", smp_blocks);
2595 qDebug(" block data uses: %d bytes", smp_block_data);
2596 qDebug(" trie data uses : %d bytes", smp_trie);
2598 int ligature_data = ligatures.size() * 2;
2599 qDebug("\n ligature data uses : %d bytes", ligature_data);
2600 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + ligature_data);
// The values written into the table must fit into an unsigned short.
2602 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
// Emit the table. The map comes first and the block data follows in the same
// array, which is why every map entry is offset by blockMap.size().
2606 out += "static const unsigned short uc_ligature_trie[] = {\n";
2607 // first write the map
2608 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2609 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2611 if (out.endsWith(' '))
2613 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2617 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2620 if (out.endsWith(' '))
2622 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2623 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2625 if (out.endsWith(' '))
2627 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2631 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2634 if (out.endsWith(' '))
// Then write the values of every unique block in hexadecimal.
2638 for (int i = 0; i < uniqueBlocks.size(); ++i) {
2639 if (out.endsWith(' '))
2642 const UniqueBlock &b = uniqueBlocks.at(i);
2643 for (int j = 0; j < b.values.size(); ++j) {
2645 if (out.endsWith(' '))
2649 out += "0x" + QByteArray::number(b.values.at(j), 16);
2653 if (out.endsWith(' '))
// Emit the lookup macro. It distinguishes code points below BMP_END, below
// SMP_END and everything else; the last case is not visible in this excerpt.
2657 out += "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2658 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2659 " ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2660 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2661 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2662 " ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2663 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2664 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
// Emit the flat ligature store in hexadecimal.
2667 out += "static const unsigned short uc_ligature_map[] = {";
2668 for (int i = 0; i < ligatures.size(); ++i) {
2670 if (out.endsWith(' '))
2674 out += "0x" + QByteArray::number(ligatures.at(i), 16);
2677 if (out.endsWith(' '))
// createCasingInfo: builds generated source text and, per its signature,
// hands it back as a QByteArray. The visible part appends the textual
// declaration of a `struct CasingInfo` to `out`.
// NOTE(review): the declaration of `out`, the remaining struct text and the
// return statement are not visible in this listing -- confirm against the full file.
2684 QByteArray createCasingInfo()
// Emitted struct uses three bit-fields: codePoint (16 bits), flags (8 bits)
// and offset (8 bits).
2688 out += "struct CasingInfo {\n"
2689 " uint codePoint : 16;\n"
2690 " uint flags : 8;\n"
2691 " uint offset : 8;\n"
// Generator entry point: runs the init*/read* steps, builds the generated
// table sections as QByteArrays, writes qunicodetables.cpp and
// qunicodetables_p.h under ../../src/corelib/tools/, then prints statistics
// through qDebug. Command-line arguments are ignored (both parameters unnamed).
// NOTE(review): several statements of this function are not visible in this
// listing; the comments below describe only what the visible lines do.
2698 int main(int, char **)
// Set-up and input phase: init*() helpers followed by read*() helpers.
// Presumably the read*() calls parse the Unicode data files -- confirm
// against their definitions.
2702 initDecompositionMap();
2705 initGraphemeBreak();
2707 initSentenceBreak();
2711 readBidiMirroring();
2712 readArabicShaping();
2714 readDerivedNormalizationProps();
2715 readSpecialCasing();
2719 readGraphemeBreak();
2721 readSentenceBreak();
// Build each generated section as a QByteArray before any file is opened.
2724 computeUniqueProperties();
2725 QByteArray properties = createPropertyInfo();
2726 QByteArray specialCases = createSpecialCaseMap();
2727 QByteArray compositions = createCompositionInfo();
2728 QByteArray ligatures = createLigatureInfo();
2729 QByteArray normalizationCorrections = createNormalizationCorrections();
2730 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
// License banner text for the generated files (the variable it is assigned
// to is not visible in this listing).
2733 "/****************************************************************************\n"
2735 "** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).\n"
2736 "** Contact: http://www.qt-project.org/\n"
2738 "** This file is part of the QtCore module of the Qt Toolkit.\n"
2740 "** $QT_BEGIN_LICENSE:LGPL$\n"
2741 "** GNU Lesser General Public License Usage\n"
2742 "** This file may be used under the terms of the GNU Lesser General Public\n"
2743 "** License version 2.1 as published by the Free Software Foundation and\n"
2744 "** appearing in the file LICENSE.LGPL included in the packaging of this\n"
2745 "** file. Please review the following information to ensure the GNU Lesser\n"
2746 "** General Public License version 2.1 requirements will be met:\n"
2747 "** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2749 "** In addition, as a special exception, Nokia gives you certain additional\n"
2750 "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2751 "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2753 "** GNU General Public License Usage\n"
2754 "** Alternatively, this file may be used under the terms of the GNU General\n"
2755 "** Public License version 3.0 as published by the Free Software Foundation\n"
2756 "** and appearing in the file LICENSE.GPL included in the packaging of this\n"
2757 "** file. Please review the following information to ensure the GNU General\n"
2758 "** Public License version 3.0 requirements will be met:\n"
2759 "** http://www.gnu.org/copyleft/gpl.html.\n"
2762 "** Alternatively, this file may be used in accordance with the terms and\n"
2763 "** conditions contained in a signed written agreement between you and Nokia.\n"
2770 "** $QT_END_LICENSE$\n"
2772 "****************************************************************************/\n\n";
// "Autogenerated" notice; DATA_VERSION_S is a string-literal macro that is
// spliced in by adjacent-literal concatenation.
// NOTE(review): there is no whitespace between the closing quote and the
// macro name. C++11 and later parse `"..."IDENT` as a user-defined-literal
// suffix, so this needs spaces around DATA_VERSION_S to build in those modes.
2775 "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
// Comment block placed into generated output, warning that the file is not
// part of the Qt API.
2777 QByteArray warning =
2779 "// W A R N I N G\n"
2780 "// -------------\n"
2782 "// This file is not part of the Qt API. It exists for the convenience\n"
2783 "// of internal files. This header file may change from version to version\n"
2784 "// without notice, or even be removed.\n"
// First output file: the table definitions. The path is relative, so the
// result depends on the working directory the tool is started from.
// NOTE(review): the result of open() is not checked in the visible lines; a
// failed open would make the subsequent write() calls fail without any
// visible diagnostic.
2789 QFile f("../../src/corelib/tools/qunicodetables.cpp");
2790 f.open(QFile::WriteOnly|QFile::Truncate);
// The generated sections are written inside namespace QUnicodeTables, wrapped
// in QT_BEGIN_NAMESPACE / QT_END_NAMESPACE.
2793 f.write("#include \"qunicodetables_p.h\"\n\n");
2794 f.write("QT_BEGIN_NAMESPACE\n\n");
2795 f.write("namespace QUnicodeTables {\n\n");
2796 f.write(properties);
2798 f.write(specialCases);
2800 f.write(compositions);
2803 f.write(normalizationCorrections);
2804 f.write("} // namespace QUnicodeTables\n\n");
2805 f.write("using namespace QUnicodeTables;\n\n");
2806 f.write("QT_END_NAMESPACE\n");
// Second output file: the private header. The same QFile object is
// re-targeted with setFileName() and opened again.
// NOTE(review): no close() between the two open() calls is visible here, and
// this open() result is also unchecked -- confirm against the full file.
2809 f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2810 f.open(QFile::WriteOnly | QFile::Truncate);
// Include guard, <QtCore/qchar.h> include and namespace opening.
2814 f.write("#ifndef QUNICODETABLES_P_H\n"
2815 "#define QUNICODETABLES_P_H\n\n"
2816 "#include <QtCore/qchar.h>\n\n"
2817 "QT_BEGIN_NAMESPACE\n\n");
// NOTE(review): same missing-whitespace issue as above, here around
// DATA_VERSION_STR.
2818 f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
// Declaration strings (property struct, script enum, break classes) written
// inside namespace QUnicodeTables.
2819 f.write("namespace QUnicodeTables {\n\n");
2820 f.write(property_string);
2821 f.write(scriptEnumDeclaration);
2822 f.write(grapheme_break_string);
2823 f.write(word_break_string);
2824 f.write(sentence_break_string);
2825 f.write(line_break_class_string);
2827 f.write("} // namespace QUnicodeTables\n\n"
2828 "QT_END_NAMESPACE\n\n"
2829 "#endif // QUNICODETABLES_P_H\n");
// Diagnostics: print the recorded maximum mirroring/case-mapping differences
// in hexadecimal.
2832 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff;
2833 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2834 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2835 qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2836 qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff;
2839 // dump(0x620, 0x640);
2840 // dump(0x10000, 0x10020);
2841 // dump(0x10800, 0x10820);
// Decomposition statistics: for each length 1..19, print how often it occurs,
// accumulate length*count into totalcompositions and the count into `sum`,
// then print the totals and the average length.
// NOTE(review): the declaration of `sum` is not visible in this listing. If
// `sum` stays 0 the average below is a floating-point division by zero.
2843 qDebug("decompositionLength used:");
2844 int totalcompositions = 0;
2846 for (int i = 1; i < 20; ++i) {
2847 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
2848 totalcompositions += i*decompositionLength.value(i, 0);
2849 sum += decompositionLength.value(i, 0);
2851 qDebug(" len decomposition map %d, average length %f, num composed chars %d",
2852 totalcompositions, (float)totalcompositions/(float)sum, sum);
2853 qDebug("highest composed character %x", highestComposedCharacter);
2854 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
// Sort `ligatures` and print each entry as a C string.
// NOTE(review): `.at(i).data()` implies a container of byte arrays, whereas
// the only visible declaration of `ligatures` in this function is a
// QByteArray. Possibly this refers to another variable or sits in disabled
// code; the relevant lines are not visible here -- confirm.
2856 qBubbleSort(ligatures);
2857 for (int i = 0; i < ligatures.size(); ++i)
2858 qDebug("%s", ligatures.at(i).data());
2860 // qDebug("combiningClass usage:");
2861 // int numClasses = 0;
2862 // for (int i = 0; i < 255; ++i) {
2863 // int num = combiningClassUsage.value(i, 0);
2866 // qDebug(" combiningClass %d used %d times", i, num);
2869 // qDebug("total of %d combining classes used", numClasses);